import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
titanic_data = pd.read_csv('titanic_train.csv')
sns.countplot(x='Survived', data=titanic_data)
<AxesSubplot:xlabel='Survived', ylabel='count'>
sns.countplot(x='Survived', hue='Sex', data=titanic_data)
<AxesSubplot:xlabel='Survived', ylabel='count'>
sns.countplot(x='Survived', hue='Pclass', data=titanic_data)
<AxesSubplot:xlabel='Survived', ylabel='count'>
plt.hist(titanic_data['Age'].dropna())
(array([ 54., 46., 177., 169., 118., 70., 45., 24., 9., 2.]),
array([ 0.42 , 8.378, 16.336, 24.294, 32.252, 40.21 , 48.168, 56.126,
64.084, 72.042, 80. ]),
<BarContainer object of 10 artists>)
plt.hist(titanic_data['Fare'])
(array([732., 106., 31., 2., 11., 6., 0., 0., 0., 3.]),
array([ 0. , 51.23292, 102.46584, 153.69876, 204.93168, 256.1646 ,
307.39752, 358.63044, 409.86336, 461.09628, 512.3292 ]),
<BarContainer object of 10 artists>)
titanic_data.isnull()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | True | False |
| 1 | False | False | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | True | False |
| 3 | False | False | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | True | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | False | False | False | False | False | False | False | False | False | False | True | False |
| 887 | False | False | False | False | False | False | False | False | False | False | False | False |
| 888 | False | False | False | False | False | True | False | False | False | False | True | False |
| 889 | False | False | False | False | False | False | False | False | False | False | False | False |
| 890 | False | False | False | False | False | False | False | False | False | False | True | False |
891 rows × 12 columns
sns.heatmap(titanic_data.isnull(), cbar=False)
<AxesSubplot:>
sns.boxplot(titanic_data['Pclass'], titanic_data['Age'])
C:\Users\DELL\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='Pclass', ylabel='Age'>
#Pclass value 1
titanic_data[titanic_data['Pclass'] == 1]['Age'].mean()
#Pclass value 2
titanic_data[titanic_data['Pclass'] == 2]['Age'].mean()
#Pclass 3
titanic_data[titanic_data['Pclass'] == 2]['Age'].mean()
29.87763005780347
def impute_missing_age(columns):
age = columns[0]
passenger_class = columns[1]
if pd.isnull(age):
if(passenger_class == 1):
return titanic_data[titanic_data['Pclass'] == 1]['Age'].mean()
elif(passenger_class == 2):
return titanic_data[titanic_data['Pclass'] == 2]['Age'].mean()
elif(passenger_class == 3):
return titanic_data[titanic_data['Pclass'] == 3]['Age'].mean()
else:
return age
titanic_data['Age'] = titanic_data[['Age', 'Pclass']].apply(impute_missing_age, axis = 1)
sns.heatmap(titanic_data.isnull(), cbar=False)
<AxesSubplot:>
titanic_data.drop('Cabin', axis=1, inplace = True)
titanic_data.dropna(inplace = True)
pd.get_dummies(titanic_data['Sex'])
| female | male | |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 1 | 0 |
| 2 | 1 | 0 |
| 3 | 1 | 0 |
| 4 | 0 | 1 |
| ... | ... | ... |
| 886 | 0 | 1 |
| 887 | 1 | 0 |
| 888 | 1 | 0 |
| 889 | 0 | 1 |
| 890 | 0 | 1 |
889 rows × 2 columns
pd.get_dummies(titanic_data['Sex'], drop_first = True)
| male | |
|---|---|
| 0 | 1 |
| 1 | 0 |
| 2 | 0 |
| 3 | 0 |
| 4 | 1 |
| ... | ... |
| 886 | 1 |
| 887 | 0 |
| 888 | 0 |
| 889 | 1 |
| 890 | 1 |
889 rows × 1 columns
sex_data = pd.get_dummies(titanic_data['Sex'], drop_first = True)
embarked_data = pd.get_dummies(titanic_data['Embarked'], drop_first = True)
titanic_data = pd.concat([titanic_data, sex_data, embarked_data], axis = 1)
titanic_data.drop(['Name', 'Ticket', 'Sex', 'Embarked'], axis = 1, inplace = True)
y_data = titanic_data['Survived']
x_data = titanic_data.drop('Survived', axis = 1)
from sklearn.model_selection import train_test_split
x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(x_data, y_data, test_size = 0.3)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_training_data, y_training_data)
C:\Users\DELL\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
LogisticRegression()
predictions = model.predict(x_test_data)
from sklearn.metrics import classification_report
classification_report(y_test_data, predictions)
' precision recall f1-score support\n\n 0 0.79 0.86 0.82 159\n 1 0.76 0.66 0.71 108\n\n accuracy 0.78 267\n macro avg 0.78 0.76 0.76 267\nweighted avg 0.78 0.78 0.78 267\n'
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test_data, predictions))
[[137 22] [ 37 71]]